package spider.web.regex;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

/**
 * @author: zhangzeli
 * @date 17:59 2018/5/5
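 * <p>Minimal web-spider demo: downloads a page and prints the link targets
 * found in its {@code href} attributes.</p>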
 */
public class WepSpiderTest {
    /**
     * Matches an {@code href="..."} attribute whose value consists only of word
     * characters, whitespace, dots, slashes and colons; group 1 is the attribute value.
     * Compiled once because the expression is a constant.
     */
    private static final Pattern HREF_PATTERN = Pattern.compile("href=\"([\\w\\s./:]+?)\"");

    /**
     * Downloads the resource at the given URL and returns its text content.
     *
     * <p>The content is read line by line and the lines are concatenated
     * <em>without</em> their line terminators. Failures are handled on a
     * best-effort basis: the stack trace is printed and whatever was read
     * before the failure (possibly the empty string) is returned.</p>
     *
     * @param durl the URL to read, e.g. {@code "http://example.com/"}
     * @param set  the charset used to decode the response bytes
     * @return the decoded content with line terminators removed; never {@code null}
     */
    public static String getURLconent(String durl,Charset set){
        StringBuilder sb = new StringBuilder();
        // try-with-resources closes the reader and the underlying stream on every path;
        // the stream is declared separately so it is released even if the reader
        // cannot be constructed.
        try (InputStream in = new URL(durl).openStream();
             BufferedReader reader = new BufferedReader(new InputStreamReader(in, set))) {
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line);
            }
        } catch (Exception e) {
            // Deliberately best-effort: report the problem and return what we have so far.
            e.printStackTrace();
        }
        return sb.toString();
    }

    /**
     * Fetches the 163.com home page (decoded as GBK) and prints every
     * {@code href} value matched by {@link #HREF_PATTERN}, one per line.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String urLconent = getURLconent("http://www.163.com/", Charset.forName("gbk"));

        Matcher regexMatcher = HREF_PATTERN.matcher(urLconent);
        while (regexMatcher.find()) {
            System.out.println(regexMatcher.group(1));
        }
    }
}
